{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import gc\n",
    "import os\n",
    "from pathlib import Path\n",
    "import random\n",
    "import sys\n",
    "\n",
    "from tqdm import tqdm_notebook as tqdm\n",
    "import numpy as np # linear algebra\n",
    "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "from IPython.core.display import display, HTML\n",
    "\n",
    "\n",
    "# --- models ---\n",
    "from sklearn import preprocessing\n",
    "from sklearn.model_selection import KFold\n",
    "\n",
    "from sklearn.metrics import mean_squared_error"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pandas.api.types import is_datetime64_any_dtype as is_datetime\n",
    "from pandas.api.types import is_categorical_dtype\n",
    "\n",
    "def reduce_mem_usage(df, use_float16=False):\n",
    "    \"\"\" iterate through all the columns of a dataframe and modify the data type\n",
    "        to reduce memory usage.        \n",
    "    \"\"\"\n",
    "    start_mem = df.memory_usage().sum() / 1024**2\n",
    "    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))\n",
    "    \n",
    "    for col in df.columns:\n",
    "        if is_datetime(df[col]) or is_categorical_dtype(df[col]):\n",
    "            # skip datetime type or categorical type\n",
    "            continue\n",
    "        col_type = df[col].dtype\n",
    "        \n",
    "        if col_type != object:\n",
    "            c_min = df[col].min()\n",
    "            c_max = df[col].max()\n",
    "            if str(col_type)[:3] == 'int':\n",
    "                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:\n",
    "                    df[col] = df[col].astype(np.int8)\n",
    "                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:\n",
    "                    df[col] = df[col].astype(np.int16)\n",
    "                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:\n",
    "                    df[col] = df[col].astype(np.int32)\n",
    "                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:\n",
    "                    df[col] = df[col].astype(np.int64)  \n",
    "            else:\n",
    "                if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:\n",
    "                    df[col] = df[col].astype(np.float16)\n",
    "                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:\n",
    "                    df[col] = df[col].astype(np.float32)\n",
    "                else:\n",
    "                    df[col] = df[col].astype(np.float64)\n",
    "        else:\n",
    "            df[col] = df[col].astype('category')\n",
    "\n",
    "    end_mem = df.memory_usage().sum() / 1024**2\n",
    "    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))\n",
    "    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))\n",
    "    \n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/local/home/ningzesun/.local/lib/python3.6/site-packages/numpy/lib/arraysetops.py:568: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison\n",
      "  mask |= (ar1 == a)\n"
     ]
    }
   ],
   "source": [
    "sub_path = '../../../Large_output/good_submission/re_submission/'\n",
    "sample_submission1 = pd.read_csv(sub_path + 'cat_1.074.csv', index_col=0)\n",
    "sample_submission2 = pd.read_csv(sub_path + 'lgb3_1.081.csv', index_col=0)\n",
    "sample_submission3 = pd.read_csv(sub_path + 'lgb_half_1.089.csv', index_col=0)\n",
    "sample_submission4 = pd.read_csv(sub_path + 'lgb3_1.082.csv', index_col=0)\n",
    "sample_submission5 = pd.read_csv(sub_path + 'xgb_new_noM_1.084.csv', index_col=0)\n",
    "sample_submission6 = pd.read_csv(sub_path + 'nn_clean_1.100.csv', index_col=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>row_id</th>\n",
       "      <th>meter_reading</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>196.794798</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>98.870458</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>23.472831</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>276.100799</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>1159.726712</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41697595</th>\n",
       "      <td>41697595</td>\n",
       "      <td>6.584752</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41697596</th>\n",
       "      <td>41697596</td>\n",
       "      <td>4.813977</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41697597</th>\n",
       "      <td>41697597</td>\n",
       "      <td>4.354947</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41697598</th>\n",
       "      <td>41697598</td>\n",
       "      <td>176.032612</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41697599</th>\n",
       "      <td>41697599</td>\n",
       "      <td>4.155505</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>41697600 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "            row_id  meter_reading\n",
       "0                0     196.794798\n",
       "1                1      98.870458\n",
       "2                2      23.472831\n",
       "3                3     276.100799\n",
       "4                4    1159.726712\n",
       "...            ...            ...\n",
       "41697595  41697595       6.584752\n",
       "41697596  41697596       4.813977\n",
       "41697597  41697597       4.354947\n",
       "41697598  41697598     176.032612\n",
       "41697599  41697599       4.155505\n",
       "\n",
       "[41697600 rows x 2 columns]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sample_submission = sample_submission1.reset_index()\n",
    "sample_submission"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "sample_submission1['log_meter_reading']=np.log1p(sample_submission1['meter_reading'])\n",
    "sample_submission2['log_meter_reading']=np.log1p(sample_submission2['meter_reading'])\n",
    "sample_submission3['log_meter_reading']=np.log1p(sample_submission3['meter_reading'])\n",
    "sample_submission4['log_meter_reading']=np.log1p(sample_submission4['meter_reading'])\n",
    "sample_submission5['log_meter_reading']=np.log1p(sample_submission5['meter_reading'])\n",
    "sample_submission6['log_meter_reading']=np.log1p(sample_submission6['meter_reading'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "ens=pd.DataFrame()\n",
    "ens['row_id']=sample_submission['row_id']\n",
    "ens['meter_reading']=np.expm1((sample_submission1['log_meter_reading']+sample_submission2['log_meter_reading']\\\n",
    "                             +sample_submission3['log_meter_reading']+sample_submission4['log_meter_reading']\\\n",
    "                             +sample_submission5['log_meter_reading']+sample_submission6['log_meter_reading'])/6.0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "ens.to_csv(\"../../../Large_output/ensemble_avg_no_leaking.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
